"ICA is a quite powerful technique and is able (in principle) to separate independent sources linearly mixed in several sensors."
-Arnaud Delorme, Centre de Recherche Cerveau & Cognition
## Setup packages
import os # call basic shell commands from inside python
import numpy as np # numerical and array calculations
import healpy as hp # all-sky "HEALPix" map format
import pandas as pd # R-like dataframe operations
#import seaborn as sb # pretty data visualizations
#import scipy
%matplotlib inline
# needed to make plots appear in a Jupyter notebook without crashing
Example shell script to get DIRBE maps from CADE@IRAP Data mirror:
#!/bin/bash
# Download the 10 DIRBE band maps (zodi-included, NSIDE 256 HEALPix FITS)
# from the CADE@IRAP mirror into the current directory.
for i in {1..10}
do
# -f: fail (non-zero exit) on HTTP errors; -O: save under the remote filename
curl -fO http://cade.irap.omp.eu/documents/Ancillary/DIRBE/DIRBE_"${i}"_256.fits
done
exit 0
Note that this will get the Zodi-included maps! These are necessary to see if the Zodi can be isolated via blind methods.
http://cade.irap.omp.eu/dokuwiki/doku.php?id=dirbe
# Load the DIRBE maps into one DataFrame, one wavelength band per column.
dirbelist = os.listdir('data/dirbezodi/')
# os.listdir returns files in arbitrary order, and a plain lexical sort puts
# DIRBE_10 before DIRBE_2 -- sort on the numeric band index embedded in the
# filename (DIRBE_<band>_256.fits) so columns line up with dirbelist_bands.
dirbelist.sort(key=lambda fname: int(fname.split('_')[1]))
dirbelist
dirbelist_bands = ['1.25','2.2','3.5','4.9','12','25','60','100','140','240'] # Wavelength of bands in microns
dirbeframe = pd.DataFrame() # Initialize a blank pandas dataframe
for fname, band in zip(dirbelist, dirbelist_bands): # pair each map with its wavelength label
    # Loads one map per column, with wavelengths as labels.
    # You can load as 'NESTED' HEALPix with nest=True, otherwise loaded as 'RING' by default.
    dirbeframe[band] = hp.read_map('data/dirbezodi/' + fname, nest=True)
dirbeframe.head()
# Visualize the input maps, for confirmation.
# BUGFIX: SymLogNorm was only imported in a later cell, so this cell raised
# NameError when run top-to-bottom -- import it here before first use.
from matplotlib.colors import SymLogNorm
for band in dirbeframe.columns:  # renamed from `map`, which shadowed the builtin
    hp.mollview(
        dirbeframe[band],
        cmap='rainbow',
        unit='MJy/sr',
        title='COBE-DIRBE {} $\mu$m map at NSIDE 256'.format(band),
        # symlog: linear near zero (below linthresh), log elsewhere
        norm=SymLogNorm(linthresh=0.01,
                        linscale=1, vmin=0),
        nest=True)
Note: Scaling ("whitening") can also be done inside the PCA/ICA/NMF function. Just set whiten = True
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
def scaleIt(df):
    """Impute HEALPix bad pixels and min-max scale each column of *df* to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        One map per column; bad pixels marked with ``hp.UNSEEN``.

    Returns
    -------
    X : ndarray
        Imputed and scaled data.
    scaler : MinMaxScaler
        The fitted scaler, so we can easily map back to the input units later.
    """
    # BUGFIX: sklearn.preprocessing.Imputer was deprecated in 0.20 and removed
    # in 0.22; prefer its replacement, falling back for old installs.
    try:
        from sklearn.impute import SimpleImputer as _Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer as _Imputer
    imp = _Imputer(missing_values=hp.UNSEEN)  # UNSEEN is the HEALPix conventional badpix value
    X = imp.fit_transform(df.values)          # find and apply the imputation in one call
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)               # learn min/max, then rescale
    return X, scaler
# Keep the fitted scaler so components can be mapped back to MJy/sr later.
dirbeframe_scaled, dirbeframe_scaler = scaleIt(dirbeframe)
To keep the example simple, we use the PCA, ICA and NMF functions built into the package scikit-learn.
For a truly scientific test, we should implement the code ourselves, or carefully evaluate the source code of sklearn.
from sklearn.decomposition import PCA
def getPCA(X, n_components=2):
    """Fit a PCA model to *X* and project *X* onto its components.

    X            -- array of shape (n_samples, n_features)
    n_components -- how many components to keep; in principle you can have
                    as many components as input vectors

    Returns (projected_data, fitted_model).
    """
    model = PCA(n_components=n_components)
    model.fit(X)
    projected = model.transform(X)
    return projected, model
S_pca_, pca = getPCA(dirbeframe_scaled)
from matplotlib.colors import SymLogNorm
import matplotlib.pyplot as plt
def visComps(comps, model, labels, title_prefix = "PC_"):
    """Render every extracted component as a Mollweide all-sky map.

    comps        -- array of shape (n_pixels, n_components), NESTED ordering
    model        -- fitted decomposition model; when it exposes
                    explained_variance_ratio_ (PCA) that figure goes in the title
    labels       -- band labels (kept for a signature uniform with plotComps)
    title_prefix -- text prepended to the component number in each title
    """
    n_comps = np.size(comps, axis=1)
    for idx in range(n_comps):
        base = title_prefix + str(idx + 1)
        # Only PCA carries explained_variance_ratio_; ICA/NMF fall through.
        try:
            pct = round(model.explained_variance_ratio_[idx] * 100, 2)
            plot_title = base + ": Explained Variance = " + str(pct) + "%"
        except AttributeError:
            plot_title = base
        hp.mollview(comps[:, idx],
                    title=plot_title,
                    cmap="rainbow",
                    norm=SymLogNorm(linthresh=0.01,
                                    linscale=1, vmin=0),
                    nest=True)
visComps(S_pca_, pca, dirbeframe.columns)
def plotComps(comps, model, scaler, labels, title_prefix = "PC_"):
    """Plot each component's spectrum: mixing weight per wavelength band.

    Shows both the scaled weights (left axis, blue) and the weights mapped
    back to physical units via the fitted MinMaxScaler (right axis, red).

    comps        -- projected data, shape (n_pixels, n_components)
    model        -- fitted decomposition model; components_ holds the weights
    scaler       -- the MinMaxScaler returned by scaleIt, used to un-scale
    labels       -- wavelength labels (strings, microns) for the x axis
    title_prefix -- text prepended to the component number in each title
    """
    for i in range(0, np.size(comps, axis=1)):
        # Only PCA carries explained_variance_ratio_; ICA/NMF fall through.
        try:
            title = title_prefix+str(i+1)+": Explained Variance = "+str(
                round(model.explained_variance_ratio_[i]*100,2))+"%"
        except AttributeError:
            title = title_prefix+str(i+1)
        x_ = [float(l) for l in labels]
        y_ = model.components_[i]
        # Invert the min-max scaling to recover physical units (MJy/sr).
        y_unscaled = (y_*scaler.data_range_)+scaler.data_min_
        # Create the general figure, then the first axes for the scaled view.
        fig1 = plt.figure()
        ax1 = fig1.add_subplot(111)
        ax1.scatter(x_, y_, label="Scaled", c='blue')
        # FIX: use a distinct index name (the original reused `i`, shadowing
        # the outer loop index) and use the enumerated label directly.
        for j, txt in enumerate(labels):
            ax1.annotate(txt, (x_[j], y_[j]))
        ax1.yaxis.tick_left()
        ax1.yaxis.set_label_position("left")
        plt.ylabel("Scaled Components [relative contribution]")
        plt.legend(fancybox=True, loc='upper left')
        # Second axes, overlaid and sharing x, for the unscaled view.
        ax2 = fig1.add_subplot(111, frameon=False, sharex=ax1)
        ax2.scatter(x_, y_unscaled, c='red', label="Unscaled")
        ax2.yaxis.tick_right()
        ax2.yaxis.set_label_position("right")
        ax2.set_ylim((np.min(y_unscaled), 1e4))
        ax2.set_yscale('symlog')
        plt.ylabel("Unscaled Components [MJy/sr]")
        plt.xlim(np.min(x_)*0.80, np.max(x_)*1.20)
        plt.xscale('log')
        plt.xlabel("Wavelength [$\mu$m]")
        plt.legend(fancybox=True, loc='upper right')
        plt.title(title)
        plt.show()
        plt.close()
plotComps(S_pca_, pca, dirbeframe_scaler, dirbeframe.columns)
from sklearn.decomposition import FastICA, NMF # We go ahead and setup for Non-negative factorazition too
def doSeparation(df, method='ICA', n_components=5, randstate=42):
    """Scale *df*, run the chosen decomposition, and visualize the result.

    Combines the steps above into one general function for all 3 methods.

    df           -- DataFrame, one map per column
    method       -- 'PCA', 'ICA' or 'NMF'
    n_components -- number of components to extract
    randstate    -- seed for FastICA's random number generator

    Raises ValueError for an unknown *method*.
    """
    df_scaled, scaler = scaleIt(df)
    if method == 'PCA':
        # Principal component analysis.
        # BUGFIX: the original swapped model/comps and fit the stale global
        # `pca` on an undefined `X` instead of this call's scaled data.
        model = PCA(n_components=n_components)
        comps = model.fit(df_scaled).transform(df_scaled)
    elif method == 'ICA':
        # Independent component analysis; the simple "fast" ICA version
        # needs a random number generator.
        rng = np.random.RandomState(randstate)
        model = FastICA(random_state=rng, n_components=n_components)
        comps = model.fit(df_scaled).transform(df_scaled)  # fit and transform in one line
        comps /= comps.std(axis=0)  # normalize component amplitudes
    elif method == 'NMF':
        # Non-negative matrix factorization.
        model = NMF(n_components=n_components)
        comps = model.fit(df_scaled).transform(df_scaled)
    else:
        # The original silently fell through and crashed later with NameError.
        raise ValueError("unknown method: {}".format(method))
    # Spatial visualization (all-sky component maps):
    visComps(comps, model, df.columns, title_prefix=method)
    # Spectral visualization (weights vs wavelength):
    plotComps(comps, model, scaler, df.columns, title_prefix=method)
# Run ICA and NMF at an increasing number of components.
# PCA is omitted: increasing n_components for PCA doesn't change the actual
# process, just how many components are kept in the end.
#doSeparation(dirbeframe, method='PCA', n_components=3)
for n_comp in (2, 3, 5, 8, 10):
    doSeparation(dirbeframe, method='ICA', n_components=n_comp)
    doSeparation(dirbeframe, method='NMF', n_components=n_comp)